#####################################################################################################################
# CREATE REGRESSION TABLES THAT REFLECT RESULTS OF COEFFICIENTS PLOTTED ON THE IGNORABILITY COVARIANCE CHECK FIGURE #
#####################################################################################################################
# Author: Kasia Nalewajko
# First created: 09 June 2024

rm(list = ls())

# LOAD PACKAGES -----------------------------------------------------------

if (!require("dplyr")) install.packages("dplyr")
if (!require("estimatr")) install.packages("estimatr")
if (!require("haven")) install.packages("haven")
if (!require("stargazer")) install.packages("stargazer")


# LOAD DATA ---------------------------------------------------------------

load("./00 SUBMITTED/00 APSR final/04 replication_files/01 data/main.Rda")
data_1872 <- read_dta("./00 SUBMITTED/00 APSR final/04 replication_files/01 data/census1872small.dta")
more_1872 <- read_dta("./00 SUBMITTED/00 APSR final/04 replication_files/01 data/canton_dataset.dta")
load("./00 SUBMITTED/00 APSR final/04 replication_files/01 data/communes_deppct.Rda")
cantonsxy <- read_dta("./00 SUBMITTED/00 APSR final/04 replication_files/01 data/cantonsxy.dta")
cantons_sf <- sf::read_sf("./00 SUBMITTED/00 APSR final/04 replication_files/01 data/Gay_CANTONS_1940/CANTONS_1940.shp")

# CLEAN ----
# ADD SQUICCIARINI TO MAIN DATA FRAME ------

# Assign Squicciarini units to 1940 counties
# overlay the coordinates with dept codes
squi_coords <- unique(cantonsxy[, c("X", "Y")])

# create a points collection
squi_coords_sf <- do.call("st_sfc",c(lapply(1:nrow(squi_coords), # `LONG` needs to be first
                                            function(i) {sf::st_point(as.numeric(squi_coords[i, ]))}), list("crs" = 4326)))

squi_coords_trans <- sf::st_transform(squi_coords_sf, 2163) # apply transformation to coords sf
cantons_trans <- sf::st_transform(cantons_sf, 2163) # apply transformation to polygons sf

# intersect and extract departement canton code
squi_coords$deppct <- apply(st_intersects(cantons_trans, squi_coords_trans, sparse = FALSE), 2,
                            function(col) {
                              cantons_trans[which(col), ]$deppct
                            })

sum(stringr::str_count(string = squi_coords$deppct, pattern = "numeric")) # 6 missing

squi_coords$deppct <- as.numeric(squi_coords$deppct)
squi_coords <- squi_coords %>% filter(!is.na(deppct))

temp <- left_join(cantonsxy, squi_coords) %>% 
  dplyr::select(ID_canton, deppct) %>% 
  filter(!is.na(deppct))

more_1872 <- left_join(more_1872, temp) %>% 
  filter(!is.na(deppct)) %>% 
  group_by(deppct) %>% 
  summarise(HH_expend1901 = mean(HH_expend, na.rm = T),
            share_cath_schools1894 = mean(share_cath_schools1894, na.rm = T),
            schools_total1894 = mean(exp(lnschools_total1894), na.rm = T)) %>% 
  mutate_all(~ifelse(is.nan(.), NA, .))

main <- left_join(main, more_1872)

# ADD 1872 CENSUS TO MAIN DATA FRAME -------

census1872_deppct <- left_join(data_1872, communes_deppct) %>% 
  mutate(pop1872 = exp(logtotalpop1872),
         male1872 = share_male1872*pop1872,
         unemployed1872 = share_unemployment1872*pop1872,
         liberalprofs1872 = share_liberals1872*pop1872,
         trade1872 = share_merchants1872*pop1872,
         industry1872 = share_industry1872*pop1872,
         agriculture1872 = share_agriculture1872*pop1872
  ) %>% 
  group_by(deppct) %>% 
  summarise(pop1872 = sum(pop1872, na.rm = T),
            male1872 = sum(male1872, na.rm = T),
            unemployed1872 = sum(unemployed1872, na.rm = T),
            liberalprofs1872 = sum(liberalprofs1872, na.rm = T),
            trade1872 = sum(trade1872, na.rm = T),
            industry1872 = sum(industry1872, na.rm = T),
            agriculture1872 = sum(agriculture1872, na.rm = T),
            male1872_share = male1872/pop1872,
            unemployed1872_share = unemployed1872/pop1872,
            liberalprofs1872_share = liberalprofs1872/pop1872,
            trade1872_share = trade1872/pop1872,
            industry1872_share = industry1872/pop1872,
            agriculture1872_share = agriculture1872/pop1872
  ) %>% 
  dplyr::select(deppct, male1872_share, unemployed1872_share, liberalprofs1872_share, trade1872_share, industry1872_share, agriculture1872_share) %>% 
  filter(!is.na(deppct))

main <- left_join(main, census1872_deppct)

# ADD WWI BATTLES TO MAIN DATA FRAME ----

main$verdun <- ifelse(main$months_Verdun > 0, 1, NA)
main$verdun <- ifelse(main$months_Verdun == 0, 0, main$verdun)

patterns <- c("14_pct$", "1872_share$")
covariates <- grep(pattern = paste(patterns, collapse = "|"), x = names(main), value = T)
covariates <- c(covariates, "verdun", "marne", "somme", "aisne", "chemindesdames", "OutsideFrance")


# RUN MODELS --------------------------------------------------------------

# Generate vector of outcomes
outcomes <- "mpf1000"

# Loop over outcomes and covariates
for(i in 1:length(outcomes)){
  models <- list()
  
  # Define covariates
  covariates <- list(
    "turnout_14_pct + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq",
    "log(empty_ballots_14_pct+1) + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq",
    "log(SFIO_14_pct+1) + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq",
    "log(REPSOC_14_pct+1) + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq",
    "log(RADSOC_14_pct+1) + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq",
    "log(RADIND_14_pct+1) + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq",
    "log(PRDS_14_pct+1) + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq",
    "log(Progressistes_14_pct+1) + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq",
    "log(ALP_14_pct+1) + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq",
    "log(Divers_14_pct+1) + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq",
    "verdun + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq",
    "marne + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq",
    "somme + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq",
    "aisne + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq",
    "chemindesdames + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq",
    "OutsideFrance + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq",
    "log(unemployed1872_share+1) + male1872_share + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq",
    "log(liberalprofs1872_share+1) + male1872_share + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq",
    "log(trade1872_share+1) + male1872_share + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq",
    "log(industry1872_share+1) + male1872_share + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq",
    "log(agriculture1872_share+1) + male1872_share + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq",
    "log(HH_expend1901+1) + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq",
    "log(schools_total1894+1) + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq",
    "log(share_cath_schools1894+1) + log(pop1911) + area_sqkm + longitude + longitudesq + latitude + latitudesq"
  )
  
  # Loop through each set of covariates
  for (j in 1:length(covariates)) {
    formula <- as.formula(paste(outcomes[i], "~", covariates[[j]]))
    models[[j]] <- lm_robust(formula, clusters = id_bureau, fixed_effects = as.factor(ar_name), data = main)
  }
}

# Export models with electoral indicators ----

modelsummary(list(models[[1]], models[[2]], models[[3]], models[[4]], models[[5]],
                  models[[6]], models[[7]], models[[8]], models[[9]], models[[10]]),
             stars = c('*' = .1, '**' = .05, '***' = .01),
             output = "latex",
             gof_omit = "AIC|BIC|RMSE|R2 Within|R2 Within Adj.",
             title = "Covariate balance check: ignorability (1914 Electoral indicators)",
             coef_rename = c("log(pop1911)" = "1911 population",
                             "area_sqkm" = "Area size (km2)",
                             "longitude" = "Longitude",
                             "latitude" = "Latitude",
                             "longitudesq" = "Longitude (sq)",
                             "latitudesq" = "Latitude (sq)",
                             "turnout_14_pct" = "Turnout",
                             "log(empty_ballots_14_pct + 1)" = "Casted empty ballots",
                             "log(SFIO_14_pct + 1)" = "SFIO votes",
                             "log(REPSOC_14_pct + 1)" = "PRS votes",
                             "log(RADSOC_14_pct + 1)" = "Rad. Socialist votes",
                             "log(RADIND_14_pct + 1)" = "Rad. Indep. votes",
                             "log(PRDS_14_pct + 1)" = "PRDS votes",
                             "log(Progressistes_14_pct + 1)" = "Progressistes votes",
                             "log(ALP_14_pct + 1)" = "ALP votes",
                             "log(Divers_14_pct + 1)" = "Other votes"
             ))

# Export models with WWI battles indicators ----

modelsummary(list(models[[11]], models[[12]], models[[13]], models[[14]], models[[15]],
                                  models[[16]]),
                             stars = c('*' = .1, '**' = .05, '***' = .01),
                             output = "latex",
                             gof_omit = "AIC|BIC|RMSE|R2 Within|R2 Within Adj.",
                             title = "Covariate balance check: ignorability (WWI Battles)",
                             coef_rename = c("log(pop1911)" = "1911 population",
                                             "area_sqkm" = "Area size (km2)",
                                             "longitude" = "Longitude",
                                             "latitude" = "Latitude",
                                             "longitudesq" = "Longitude (sq)",
                                             "latitudesq" = "Latitude (sq)",
                                             "verdun" = "Fought at Verdun",
                                             "marne" = "Fought at Marne",
                                             "somme" = "Fought at Somme",
                                             "aisne" = "Fought at Aisne",
                                             "chemindesdames" = "Fought at Chemin des Dames",
                                             "OutsideFrance" = "Fought outside France"
                             ))

# Export models with 1872 census indicators ----

modelsummary(list(models[[17]], models[[18]], models[[19]], models[[20]], models[[21]]),
             stars = c('*' = .1, '**' = .05, '***' = .01),
             output = "latex",
             gof_omit = "AIC|BIC|RMSE|R2 Within|R2 Within Adj.",
             title = "Covariate balance check: ignorability (1872 census)",
             coef_rename = c("log(pop1911)" = "1911 population",
                             "area_sqkm" = "Area size (km2)",
                             "longitude" = "Longitude",
                             "latitude" = "Latitude",
                             "longitudesq" = "Longitude (sq)",
                             "latitudesq" = "Latitude (sq)",
                             "log(unemployed1872_share + 1)" = "Share unemployed",
                             "log(liberalprofs1872_share + 1)" = "Share in liberal occup.",
                             "log(trade1872_share + 1)" = "Share in trade",
                             "log(industry1872_share + 1)" = "Share in industry",
                             "log(agriculture1872_share + 1)" = "Share in agricul.",
                             "male1872_share" = "Share of men"
             ))

# Export models with other historical indicators ----

modelsummary(list(models[[22]], models[[23]], models[[24]]),
             stars = c('*' = .1, '**' = .05, '***' = .01),
             output = "latex",
             gof_omit = "AIC|BIC|RMSE|R2 Within|R2 Within Adj.",
             title = "Covariate balance check: ignorability (Other historical indicators)",
             coef_rename = c("log(pop1911)" = "1911 population",
                             "area_sqkm" = "Area size (km2)",
                             "longitude" = "Longitude",
                             "latitude" = "Latitude",
                             "longitudesq" = "Longitude (sq)",
                             "latitudesq" = "Latitude (sq)",
                             "log(HH_expend1901 + 1)" = "Avg. household expend. (1901)",
                             "log(schools_total1894 + 1)" = "Number schools (1894)",
                             "log(share_cath_schools1894 + 1)" = "Share cath. schools (1894)"
             ))





